Run MED
Paths and libraries setting
# Load main packages, paths and custom functions
source("../../../source/main_packages.R")
source("../../../source/paths.R")
source("../../../source/functions.R")
# Load supplementary packages
packages <- c("Biostrings", "seqinr")
invisible(lapply(packages, require, character.only = TRUE))
metadata <- read.csv(paste0(path_metadata,"/metadata_08_02_2021.csv"), sep=";")
rownames(metadata) <- metadata$SamplePrepare data for MED
To be run in MED workflow, data needs to be in this form (https://merenlab.org/2012/05/11/oligotyping-pipeline-explained/#preparing-the-fasta-file) :
>Sample-01_ReadX
GTTGAAAAAGTTAGTGGTGAAATCCCAGA
>Sample-01_ReadY
GTTGAAAAAGTTAGTGGTGAAATCCCAGA
>Sample-01_ReadZ
GGTGAAAAAGTTAGTGGTGAAATCCCAGA
>Sample-02_ReadN
GTTGAAAAAGTTAGTGGTGAAATCCCAGA
>Sample-02_ReadM
GTTGAAAAAGTTAGTGGTGAAATCCCAGA
Combine merged files
Load the fasta file from merge step
List and remove samples with less than 1000 reads except to NP20, NP29, NP30, NP34 et NP36
nmax = 1000
colnames(read_by_sample)[1] <- "Sample"
metadata <- metadata %>% merge(read_by_sample, by="Sample")
bad_samples <- metadata[metadata$nread<nmax,]
toremove <- c("NP20", "NP29", "NP30", "NP34", "NP36")
bad_samples <- bad_samples[!(bad_samples$Sample %in% toremove), ]
metadata <- metadata[!metadata$Sample %in% bad_samples$Sample, ]Number of read in each sample
#guide_italics <- guides(fill = guide_legend(label.theme = element_text(size = 10, face = "italic", colour = "Black", angle = 0)))
p <- ggplot(metadata, aes(x = Sample, y = nread))+
geom_bar(position = "dodge", stat = "identity")+
theme(axis.text.x = element_text(angle = 90),
legend.title = element_text(size = 18),
legend.position="bottom",
legend.text=element_text(size=11),
panel.spacing.y=unit(1, "lines"),
panel.spacing.x=unit(0.8, "lines"),
panel.spacing=unit(0,"lines"),
strip.background=element_rect(color="grey30", fill="grey90"),
panel.border=element_rect(color="grey90"),
axis.ticks.x=element_blank(),
strip.text.x = element_text(size = 12)) +
guides(fill=guide_legend(title="Sample", label.theme = element_text(size = 10, face = "italic",
colour = "Black", angle = 0)))+
facet_wrap(~Species+Location+Organ, scales = "free_x", ncol=4)+
labs(y="Sequence counts")+
ylim(0, 900000)+
geom_text(aes(label=nread),
position=position_dodge(width=0.5),
size=2, hjust=-0.25, vjust=0.25, angle=90)+
scale_fill_manual(values = col)+
theme_bw() +
ggtitle("")
pTotal number of read and samples in the final file
# Merge of fasta_df and metadata
fasta_df$Sample_only <- sub("_.*", "", fasta_df$Sample)
fasta_df <- droplevels(fasta_df)
fasta_df_final <- fasta_df[fasta_df$Sample_only %in% metadata$Sample,]
fasta_df_final <- droplevels(fasta_df_final)
cat(paste0("Number of reads: ", nrow(fasta_df_final), "\n"))## Number of reads: 6801022
cat(paste0("Number of samples: ", levels(fasta_df_final$Sample_only %>% as.factor()) %>% length(), "\n"))## Number of samples: 123
Save fasta and list of the remove samples
setwd(path_fasta)
# objet R
save(fasta_df_final, file=paste0("1A_MED_sequences.RData"))
# write fasta
write.fasta(sequences=as.list(fasta_df_final$Seq), names=fasta_df_final$Sample, file.out=paste0("1A_MED_sequences.fasta"))
# write list of the removed samples
write.table(bad_samples, file=paste0("1A_MED_removed_samples.tsv"), sep="\t", row.names=FALSE, quote = FALSE)Run MED
Docker
In a terminal session :
# move in repertory with the fasta file
cd /Volumes/MY_PASSPORT/draft_final/output/1_fasta
# run docker session
sudo docker run --rm -it -v `pwd`:`pwd` -w `pwd` -p 8080:8080 meren/oligotyping:latest
Info from fasta
# get info from fasta
o-get-sample-info-from-fasta 1A_MED_sequences.fasta
Gaps
# fill gaps
o-pad-with-gaps 1A_MED_sequences.fasta
Check info from fasta after gaps
# get info from fasta
o-get-sample-info-from-fasta 1A_MED_sequences.fasta-PADDED-WITH-GAPS
Decompose
# get info from fasta
decompose 1A_MED_sequences.fasta-PADDED-WITH-GAPS
# exit
exit
Move results
cd /Volumes/MY_PASSPORT/draft_final/output
mkdir 1_MED
cd /Volumes/MY_PASSPORT/draft_final/output/1_fasta
mv 1A_MED_sequences-m0.10-A0-M0-d4 ../1_MED
Results
Results of MED are stored in the 1A_MED_sequences-m0.10-A0-M0-d4 folder.